# Load the raw video game sales CSV from the project's dataset directory.
videogames.df <- read.csv(
  file = file.path(project.dir, dataset.dir, 'vgsales-12-4-2019.csv')
)
# List the columns available in the raw data.
names(videogames.df)
## [1] "Rank" "Name" "basename" "Genre"
## [5] "ESRB_Rating" "Platform" "Publisher" "Developer"
## [9] "VGChartz_Score" "Critic_Score" "User_Score" "Total_Shipped"
## [13] "Global_Sales" "NA_Sales" "PAL_Sales" "JP_Sales"
## [17] "Other_Sales" "Year" "Last_Update" "url"
## [21] "status" "Vgchartzscore" "img_url"
# The data was collected in April 2019, so games with Year = 2019 are excluded:
# they would not give a comprehensive picture of all the sales during 2019.
# E was originally called KA for ESRB ratings, so every KA rating is recoded
# as E in the same pass.
videogames.clean <- videogames.df %>%
  filter(Year < 2019) %>%
  mutate(ESRB_Rating = replace(ESRB_Rating, ESRB_Rating == 'KA', 'E'))
# Inspect the distinct ratings, then give ESRB_Rating explicit factor levels
# for easier graphing / data manipulation.
unique(videogames.clean$ESRB_Rating)
## [1] "E" "" "M" "E10" "T" "RP" "EC" "AO"
videogames.clean$ESRB_Rating <- factor(
  videogames.clean$ESRB_Rating,
  levels = c('', 'RP', 'E', 'EC', 'E10', 'T', 'M', 'AO')
)
We want to compare sales across different regions, so it would be convenient to have one column ‘region’ and then a corresponding column for sales in USD (millions).
# Reshape to long form: each sales column in the range
# Global_Sales:Other_Sales becomes a row, with the column name stored in
# `Region` and its value in `Sales`; rows with missing Sales are dropped.
# NOTE(review): the gathered range starts at Global_Sales, so "Global_Sales"
# appears as a Region value next to the regional columns. Aggregating Sales
# over all Region values for a game therefore counts its sales more than
# once -- confirm this is intended wherever vs_byregion is summarised.
vs_byregion <- videogames.clean %>%
  gather(Region, Sales, Global_Sales:Other_Sales, na.rm = TRUE)
Conduct some descriptive analysis on the data, figuring out:
* distributions of variables,
* variables that appear to be strongly related to each other (using appropriate methods to quantify the relationships based on whether the variables are numerical or categorical).
From the boxplot we can see that we have two extreme outliers. After investigating, it looks like the two outliers are GTA V (PS3 and PS4).
# Box plot of global sales, used to spot extreme values.
boxplot(
  x = videogames.clean$Global_Sales,
  xlab = 'Global Sales (millions of USD)'
)
# Print the rows whose global sales exceed 17 (the extreme points).
subset(videogames.clean, Global_Sales > 17)
# Histogram of global sales with the x axis restricted to 0-0.5.
hist(
  x = videogames.clean$Global_Sales,
  breaks = 2000,
  xlim = c(0, .5),
  xlab = 'Global Sales (millions of USD)'
)
# Interactive bar chart of the number of games per platform, with platforms
# ordered from most to fewest games and x labels rotated to stay readable.
(
  ggplot(
    data = count(videogames.clean, Platform, sort = TRUE),
    mapping = aes(x = reorder(Platform, -n), y = n)
  ) +
    geom_bar(stat = "identity", position = position_dodge(width = 0)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
) %>%
  ggplotly()
# Number of games per ESRB rating.
ggplot(videogames.clean, aes(x = ESRB_Rating)) +
  geom_bar()
# Number of games per genre, ordered from most to fewest games.
ggplot(
  data = count(videogames.clean, Genre, sort = TRUE),
  mapping = aes(x = reorder(Genre, -n), y = n)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Here we look at the distributions of User Scores and Critic Scores, as well as the average Critic and User Scores over time.
# Overlaid histograms of critic and user scores (bin width 0.5).
# Inside aes(), `fill` is a data mapping, not a colour: a constant string
# there becomes a legend label and the bars get default palette colours.
# So the series names are mapped to `fill` and the intended colours are
# assigned through scale_fill_manual().
# alpha keeps the first layer visible underneath the second one.
videogames.clean %>% ggplot() +
  geom_histogram(
    aes(x = Critic_Score, fill = 'Critic Score'),
    binwidth = 0.5, alpha = 0.5
  ) +
  geom_histogram(
    aes(x = User_Score, fill = 'User Score'),
    binwidth = 0.5, alpha = 0.5
  ) +
  scale_fill_manual(
    name = 'Score type',
    values = c('Critic Score' = 'pink', 'User Score' = 'blue')
  ) +
  xlab('Score')
We have a ton of publishers
# Number of games per release year.
ggplot(videogames.clean, aes(x = Year)) +
  geom_bar()
# Total sales for each Year / Region combination of the long-form data.
vs_sales.byregion.byyear <- summarize(
  group_by(vs_byregion, Year, Region),
  Sales = sum(Sales)
)
# Yearly sales drawn as one coloured line per Region value.
ggplot(vs_sales.byregion.byyear, aes(x = Year, y = Sales, color = Region)) +
  geom_line()
# Mean user, critic and VGChartz score per year (from 1989 on), drawn as one
# line per score type.
# NOTE(review): the previous version also computed `User_Score2` (User_Score
# blanked out before 1996) but never plotted it. The plotted series are kept
# as they were; if pre-1996 user scores should be hidden, mask User_Score
# itself before the gather() step.
videogames.clean %>%
  group_by(Year) %>%
  summarise(
    User_Score = mean(User_Score, na.rm = TRUE),
    Critic_Score = mean(Critic_Score, na.rm = TRUE),
    Vgchartzscore = mean(Vgchartzscore, na.rm = TRUE)
  ) %>%
  filter(Year >= 1989) %>%
  gather(
    ScoreType, Score,
    c(User_Score, Critic_Score, Vgchartzscore),
    na.rm = TRUE
  ) %>%
  ggplot(aes(x = Year)) + # TODO : Make look better
  geom_line(aes(y = Score, color = ScoreType)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  # A single x scale: a later xlim() would replace this scale and discard
  # the yearly breaks. The axis shows years, so it is labelled 'Year'.
  scale_x_continuous('Year', breaks = 1989:2018, limits = c(1989, 2018))
# Drop rows with missing global sales and the extreme outliers
# (global sales of 17 or more).
# Both conditions must hold, hence `&`: with `|`, every non-missing row
# passed the filter and the outliers were never removed.
# Printed results further down that were produced before this fix should be
# regenerated.
videogames.clean <- videogames.clean[
  which(
    !is.na(videogames.clean$Global_Sales) &
      videogames.clean$Global_Sales < 17
  ),
]
- Construct a confidence interval (CI) for the population mean of sales
T-Distribution
# t-based confidence interval for the mean of Global_Sales:
# mu.hat -/+ t(1 - alpha/2, n - 1) * se, with se = sd.hat / sqrt(n).
confidence <- 0.95
alpha <- 1 - confidence
n <- nrow(videogames.clean)
mu.hat <- mean(videogames.clean$Global_Sales)
sd.hat <- sd(videogames.clean$Global_Sales)
se <- sd.hat / sqrt(n)
# Lower and upper bound computed in one vectorised step.
CI <- mu.hat + c(-1, 1) * qt(1 - alpha / 2, df = n - 1) * se
mu.hat
## [1] 0.3670216
CI
## [1] 0.3552499 0.3787933
Check assumptions:
* The sample is not randomized (VGChartz’s game database does not include all games and would have a bias towards including games that are available in English).
* The population sales distribution is not normal at all (extreme right skew).
* The dataset had two extreme outliers, which were identified via boxplot and removed.
Although the t-distribution CI is robust against non-normal populations, it is highly sensitive to violations of the random sampling assumption. Our dataset is likely missing a disproportionate number of non-Western games and older games, so we likely have an undercoverage issue using the t-distribution method if we consider our population to be all video games that ever existed worldwide. But if we consider our population to be ???????????? [TODO: specify the population], then the CI we have is trustworthy.
We can also use the bootstrap to estimate the 95% CI for the mean of video game sales. I would not expect this result to be significantly different from, or better than, the t-distribution method: bootstrapping is also sensitive to non-random sampling, because it assumes that our sample is a good representation of the population we are interested in.
Here we conducted a one-sample t-test at the 95% confidence level on global sales for the Action genre, ignoring any NA values. The null hypothesis is that the mean Action game sales equals the overall mean of video game sales. The overall mean does not fall inside the 95% confidence interval for mean Action game sales, so we reject the null hypothesis.
# Ignored all NA values
# Subset to Action games and run a one-sample t-test on their global sales.
# NOTE(review): as the output below states, the reported t statistic and
# p-value test against a true mean of 0; only the confidence interval is
# relevant when comparing with the overall mean.
actions <- filter(videogames.clean, Genre == 'Action')
t.test(x = actions$Global_Sales, conf.level = 0.95)
##
## One Sample t-test
##
## data: actions$Global_Sales
## t = 21.096, df = 2893, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.3780538 0.4555330
## sample estimates:
## mean of x
## 0.4167934
# Keep only the 95% confidence interval for the mean global sales of
# Action games.
actionconf <- t.test(actions$Global_Sales, conf.level = 0.95)$conf.int
actionconf
## [1] 0.3780538 0.4555330
## attr(,"conf.level")
## [1] 0.95
# 0.3780538 0.4555330
# Null hypothesis: the mean Action sales equals the overall mean of video
# game sales, i.e. the overall mean lies inside the interval above.
avgsales <- mean(videogames.clean$Global_Sales, na.rm = TRUE)
avgsales
## [1] 0.3670216
#0.3670216
# The overall mean does not fall into the 95% confidence interval, so we
# reject the null hypothesis.
Assumptions for a two-sided significance test comparing two population means:
1) A quantitative response variable for two groups - in this case the response is sales, which is quantitative.
2) Independent random samples - our data isn’t random.
3) Approximately normal population for each group - not true: the sales of video games have a major right skew, as most video games do not sell very well. The majority of games sell less than US$1 million.
# Histogram of Sales for the Sports rows of the long-form data.
ggplot(filter(vs_byregion, Genre == "Sports"), aes(x = Sales)) +
  geom_histogram()
# Histogram of Sales for the Shooter rows of the long-form data.
ggplot(filter(vs_byregion, Genre == "Shooter"), aes(x = Sales)) +
  geom_histogram()
Sales in Sports vs. Shooters. $H_0: \mu_{\text{Sports}} - \mu_{\text{Shooter}} = 0$; $H_a: \mu_{\text{Sports}} - \mu_{\text{Shooter}} \neq 0$
# Mean Sales over the Sports rows of the long-form data, ignoring NAs.
with(vs_byregion, mean(Sales[Genre == "Sports"], na.rm = TRUE))
## [1] 0.2596627
# Mean Sales over the Shooter rows of the long-form data, ignoring NAs.
with(vs_byregion, mean(Sales[Genre == "Shooter"], na.rm = TRUE))
## [1] 0.3556595
Step 2: Calculate the test statistic Step 3: ??? Step 4: Obtain p-Value and profit
# Long-form rows for each genre separately (x1: Sports, x2: Shooter) and for
# both genres together (x3).
x1 <- filter(vs_byregion, Genre == "Sports")
x2 <- filter(vs_byregion, Genre == "Shooter")
x3 <- filter(vs_byregion, Genre %in% c("Sports", "Shooter"))
# Disabled two-sample test. NOTE(review): as written it pairs Sales from the
# full vs_byregion with Genre from the x3 subset, which have different
# lengths; t.test(Sales ~ Genre, data = x3) is probably what was meant.
#t.test(vs_byregion$Sales~x3$Genre)